import pandas as pd
import numpy as np
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import joblib
import os
import warnings
warnings.filterwarnings('ignore')
plt.rcParams['font.sans-serif']=['SimHei'] # render CJK glyphs in matplotlib
# Seaborn style with a CJK-capable font
sns.set(font='Microsoft YaHei')
d:\ProgramData\Anaconda3\envs\pytorch\lib\site-packages\numpy\_distributor_init.py:32: UserWarning: loaded more than 1 DLL from .libs: d:\ProgramData\Anaconda3\envs\pytorch\lib\site-packages\numpy\.libs\libopenblas.WCDJNK7YVMPZQ2ME2ZZHJJRJ3JIKNDB7.gfortran-win_amd64.dll d:\ProgramData\Anaconda3\envs\pytorch\lib\site-packages\numpy\.libs\libopenblas.XWYDX2IKJW2NMTWSFYNGFUWKQU3LYTCZ.gfortran-win_amd64.dll stacklevel=1)
# Dictionary collecting each model's final test-set score
result_model = {}
# target = ['TN loss (%)', 'NH3-N loss (%)', 'N2O-N loss (%)']
target = 'NH3-N loss (%)' # change this name to switch the prediction target
# target = ['EF %', 'LN(EF)', 'LNR(N2O)', 'N2O rate(kg N ha-1 y-1)']
output_file = 'output/TN_NH3_N2O'
input_file = 'data/TN_NH3_N2O'
model_save_file = f'output/TN_NH3_N2O/model_{target}'
os.makedirs(output_file, exist_ok=True)
os.makedirs(model_save_file, exist_ok=True)
# Input CSVs and output artifact paths, all keyed by the target name.
data_path = f'{input_file}/data_for_{target}.csv'
data_tree_path = f'{input_file}/data_for_tree_{target}.csv'
model_performance_path = f'{output_file}/Model_{target}.html'
model_performance_json = f'{output_file}/result_model_{target}.json'
# Last CSV column is the label; everything before it is a feature.
data_all_ef = pd.read_csv(data_path)
X_all = data_all_ef.iloc[:, :-1]
y_all = data_all_ef.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=2023)
y_train = y_train.reset_index(drop=True)
## To evaluate model performance properly, split into train/test sets, fit on
## the training split and validate on the held-out split.
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
num_folds = 5
kf = KFold(n_splits=num_folds, shuffle=True)  # NOTE(review): no random_state — folds differ run to run
input_cols = X_all.columns.tolist()
# Tree-model dataset: the label-encoded variant fed to LightGBM/CatBoost.
data_all_tree = pd.read_csv(data_tree_path)
X_allt = data_all_tree.iloc[:, :-1]
y_allt = data_all_tree.iloc[:, -1]
# BUG FIX: this split previously used X_all/y_all, so the "tree" train/test
# sets silently reused the non-tree dataset. Split the tree dataset itself.
X_traint, X_testt, y_traint, y_testt = train_test_split(
    X_allt, y_allt, test_size=0.2, random_state=2023)
y_traint = y_traint.reset_index(drop=True)
## To evaluate model performance properly, train on the training split and
## validate on the held-out split; KFold drives cross-validation below.
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
num_folds = 5
kf = KFold(n_splits=num_folds, shuffle=True)
X_train.info()  # dtype / non-null overview of the training features
<class 'pandas.core.frame.DataFrame'> Int64Index: 669 entries, 545 to 537 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 material_0 669 non-null int64 1 initial CN(%) 669 non-null float64 2 initial moisture content(%) 669 non-null float64 3 initial pH 669 non-null float64 4 material_1 669 non-null int64 5 Excipients 669 non-null int64 6 initial TN(%) 669 non-null float64 7 initial TC(%) 669 non-null float64 8 Additive Species 669 non-null int64 dtypes: float64(5), int64(4) memory usage: 52.3 KB
X_train.describe()  # summary stats; -1 appears to act as the missing-value sentinel
| material_0 | initial CN(%) | initial moisture content(%) | initial pH | material_1 | Excipients | initial TN(%) | initial TC(%) | Additive Species | |
|---|---|---|---|---|---|---|---|---|---|
| count | 669.000000 | 669.000000 | 669.000000 | 669.000000 | 669.000000 | 669.000000 | 669.000000 | 669.000000 | 669.000000 |
| mean | 3.292975 | 13.719288 | 50.376727 | 4.942949 | 2.887892 | 55.754858 | -0.119390 | 11.450745 | 3.608371 |
| std | 1.445920 | 11.782051 | 29.399692 | 3.882304 | 1.292882 | 26.505163 | 1.972162 | 31.433285 | 0.928099 |
| min | 0.000000 | -1.000000 | -1.000000 | -1.000000 | 0.000000 | 0.000000 | -1.000000 | -1.000000 | 0.000000 |
| 25% | 3.000000 | -1.000000 | 48.000000 | -1.000000 | 3.000000 | 38.000000 | -1.000000 | -1.000000 | 4.000000 |
| 50% | 3.000000 | 15.390000 | 63.000000 | 6.900000 | 3.000000 | 71.000000 | -1.000000 | -1.000000 | 4.000000 |
| 75% | 5.000000 | 21.373333 | 70.000000 | 7.800000 | 4.000000 | 78.000000 | -1.000000 | -1.000000 | 4.000000 |
| max | 6.000000 | 53.008273 | 88.500000 | 10.320000 | 4.000000 | 78.000000 | 14.200000 | 197.000000 | 4.000000 |
X_train  # bare expression — only displays in a notebook cell
| material_0 | initial CN(%) | initial moisture content(%) | initial pH | material_1 | Excipients | initial TN(%) | initial TC(%) | Additive Species | |
|---|---|---|---|---|---|---|---|---|---|
| 545 | 6 | 41.00000 | 56.900000 | 7.83 | 4 | 78 | 3.9 | 159.9 | 4 |
| 175 | 3 | -1.00000 | -1.000000 | 6.85 | 2 | 78 | -1.0 | -1.0 | 3 |
| 11 | 3 | -1.00000 | -1.000000 | -1.00 | 3 | 78 | -1.0 | -1.0 | 0 |
| 121 | 3 | -1.00000 | -1.000000 | -1.00 | 0 | 78 | -1.0 | -1.0 | 3 |
| 714 | 5 | 13.56000 | 71.050000 | 8.08 | 4 | 45 | -1.0 | -1.0 | 4 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 515 | 5 | 18.40000 | 55.300000 | 6.44 | 4 | 49 | -1.0 | -1.0 | 4 |
| 732 | 3 | 6.10000 | 71.000000 | 6.70 | 2 | 12 | -1.0 | -1.0 | 4 |
| 695 | 0 | 37.83542 | 65.276961 | 7.30 | 4 | 78 | -1.0 | -1.0 | 4 |
| 454 | 3 | 16.60000 | 53.800000 | 7.90 | 2 | 13 | -1.0 | -1.0 | 4 |
| 537 | 6 | -1.00000 | 70.000000 | 8.40 | 4 | 78 | -1.0 | -1.0 | 4 |
669 rows × 9 columns
y_train.isnull().any()  # confirm the target has no missing values
False
print(X_train.isnull().any())  # per-column missing-value check
material_0 False initial CN(%) False initial moisture content(%) False initial pH False material_1 False Excipients False initial TN(%) False initial TC(%) False Additive Species False dtype: bool
np.isnan(X_train).any()  # numeric NaN check; bare expression only displays in a notebook
material_0 False initial CN(%) False initial moisture content(%) False initial pH False material_1 False Excipients False initial TN(%) False initial TC(%) False Additive Species False dtype: bool
from sklearn.ensemble import RandomForestRegressor

# Random-forest baseline with 5-fold out-of-fold predictions.
# FIX: criterion='mse' and max_features='auto' were removed in scikit-learn
# 1.2 / 1.3; 'squared_error' and 1.0 are the equivalent modern spellings.
rf_model = RandomForestRegressor(n_estimators=1000, criterion='squared_error',
                                 max_depth=10, min_samples_split=2,
                                 min_samples_leaf=1, min_weight_fraction_leaf=0.0,
                                 max_features=1.0, max_leaf_nodes=None,
                                 bootstrap=True, oob_score=False,
                                 n_jobs=1, random_state=None,
                                 verbose=0, warm_start=False)
rf_preds = np.zeros(X_train.shape[0])  # out-of-fold predictions
for train_index, valid_index in kf.split(X_train):
    x_tr, x_va = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_tr, y_va = y_train.iloc[train_index], y_train.iloc[valid_index]
    print(x_tr.shape, y_tr.shape)
    rf_model.fit(x_tr, y_tr)
    rf_pred_valid = rf_model.predict(x_va)
    rf_preds[valid_index] = rf_pred_valid
    # FIX: mean_squared_error returns MSE, not RMSE — label it correctly.
    print('The mse of the RFRegression is:', metrics.mean_squared_error(y_va, rf_pred_valid))
    x = list(range(0, len(y_va)))
    plt.figure(figsize=(40, 10))
    plt.plot(x, y_va, 'g', label='ground-truth')        # FIX: 'turth' typo
    plt.plot(x, rf_pred_valid, 'r', label='rf predict')  # FIX: labels said 'xgb'
    plt.legend(loc='best')
    plt.title('rf-yield')                                # FIX: title said 'xgb-yield'
    plt.xlabel('sample')
    plt.ylabel('Yield_lnRR')
    plt.show()
(535, 9) (535,) The rmse of the RFRegression is: 72.33406927962925
(535, 9) (535,) The rmse of the RFRegression is: 120.24513283097679
(535, 9) (535,) The rmse of the RFRegression is: 57.01293697537265
(535, 9) (535,) The rmse of the RFRegression is: 72.7852758294013
(536, 9) (536,) The rmse of the RFRegression is: 47.14527417335798
# Evaluate the last-fold RF model on the held-out test split and record its MSE.
rf_pred_test = rf_model.predict(X_test)
# FIX: label — mean_squared_error returns MSE, not RMSE.
print('The mse of the test is:', metrics.mean_squared_error(y_test, rf_pred_test))
x = list(range(0, len(y_test)))
plt.figure(figsize=(40, 10))
plt.plot(x, y_test, 'g', label='ground-truth')  # FIX: 'turth' typo
plt.plot(x, rf_pred_test, 'r', label='rf predict')
plt.legend(loc='best')
plt.title('rf-yield')
plt.xlabel('sample')
plt.ylabel('Yield_lnRR')
plt.show()
joblib.dump(rf_model, f'{model_save_file}/rf_model.pkl')
result_model['RandomForest(k)'] = metrics.mean_squared_error(y_test, rf_pred_test)
The rmse of the test is: 87.06266377717145
import xgboost as xgb

# XGBoost with 5-fold out-of-fold predictions.
xgb_preds = np.zeros(X_train.shape[0])
kf = KFold(n_splits=num_folds, shuffle=True)
xgb_model = xgb.XGBRegressor(max_depth=8,
                             learning_rate=0.1,
                             n_estimators=300,
                             n_jobs=4,
                             colsample_bytree=0.8,
                             subsample=0.8,
                             random_state=32,
                             tree_method='hist')
for train_index, valid_index in kf.split(X_train):
    x_tr, x_va = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_tr, y_va = y_train.iloc[train_index], y_train.iloc[valid_index]
    print(x_tr.shape, y_tr.shape)
    xgb_model.fit(x_tr, y_tr)
    xgb_pred_valid = xgb_model.predict(x_va)
    xgb_preds[valid_index] = xgb_pred_valid
    # FIX: mean_squared_error returns MSE, not RMSE — label it correctly.
    print('The mse of the XgbRegression is:', metrics.mean_squared_error(y_va, xgb_pred_valid))
    x = list(range(0, len(y_va)))
    plt.figure(figsize=(40, 10))
    plt.plot(x, y_va, 'g', label='ground-truth')  # FIX: 'turth' typo
    plt.plot(x, xgb_pred_valid, 'r', label='xgb predict')
    plt.legend(loc='best')
    plt.title('xgb-yield')
    plt.xlabel('sample')
    plt.ylabel('Yield_lnRR')
    plt.show()
(535, 9) (535,) The rmse of the XgbRegression is: 73.41349618255467
(535, 9) (535,) The rmse of the XgbRegression is: 91.57950901758214
(535, 9) (535,) The rmse of the XgbRegression is: 74.12756581448943
(535, 9) (535,) The rmse of the XgbRegression is: 87.34311628843
(536, 9) (536,) The rmse of the XgbRegression is: 79.56812684878001
# Evaluate the last-fold XGBoost model on the held-out test split.
xgb_pred_test = xgb_model.predict(X_test)
# FIX: label — mean_squared_error returns MSE, not RMSE.
print('The mse of the XgbRegression is:', metrics.mean_squared_error(y_test, xgb_pred_test))
x = list(range(0, len(y_test)))
plt.figure(figsize=(40, 10))
plt.plot(x, y_test, 'g', label='ground-truth')  # FIX: 'turth' typo
plt.plot(x, xgb_pred_test, 'r', label='xgb predict')
plt.legend(loc='best')
plt.title('xgb-yield')
plt.xlabel('sample')
plt.ylabel('Yield_lnRR')
plt.show()
joblib.dump(xgb_model, f'{model_save_file}/xgb_model.pkl')
result_model['XGBoost(k)'] = metrics.mean_squared_error(y_test, xgb_pred_test)
The rmse of the XgbRegression is: 94.74935148490458
import lightgbm as lgb

# LightGBM on the tree-encoded dataset, 5-fold out-of-fold predictions.
lgb_preds = np.zeros(X_traint.shape[0])
for i, (train_index, valid_index) in enumerate(kf.split(X_traint, y_traint)):
    print('************************************ {} ************************************'.format(str(i+1)))
    X_tr, y_tr, X_va, y_va = X_traint.iloc[train_index], \
        y_traint.iloc[train_index], X_traint.iloc[valid_index], y_traint.iloc[valid_index]
    params_lgb = {
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'learning_rate': 0.1,
        'n_estimators': 300,
        'metric': 'root_mean_squared_error',
        'min_child_weight': 1e-3,
        'min_child_samples': 10,
        'num_leaves': 31,
        'max_depth': -1,   # unlimited depth
        'seed': 2023,
        'verbose': -1,
    }
    lgb_model = lgb.LGBMRegressor(**params_lgb)
    lgb_model.fit(X_tr, y_tr, eval_set=[(X_tr, y_tr), (X_va, y_va)], eval_metric=['mae', 'rmse'])
    lgb_val_pred = lgb_model.predict(X_va)
    lgb_preds[valid_index] = lgb_val_pred
    # FIX: mean_squared_error returns MSE, not RMSE — label it correctly.
    print('The mse of the LgbRegression is:', metrics.mean_squared_error(y_va, lgb_val_pred))
    x = list(range(0, len(y_va)))
    plt.figure(figsize=(40, 10))
    plt.plot(x, y_va, 'g', label='ground-truth')  # FIX: 'turth' typo
    plt.plot(x, lgb_val_pred, 'r', label='lgb predict')
    plt.legend(loc='best')
    plt.title('lgb-yield')
    plt.xlabel('sample')
    plt.ylabel('Yield_lnRR')
    plt.show()
************************************ 1 ************************************ The rmse of the LgbRegression is: 63.47356331282958
************************************ 2 ************************************ The rmse of the LgbRegression is: 57.67203190953856
************************************ 3 ************************************ The rmse of the LgbRegression is: 62.06630210427079
************************************ 4 ************************************ The rmse of the LgbRegression is: 115.09563250500037
************************************ 5 ************************************ The rmse of the LgbRegression is: 80.12872820839479
# Evaluate the last-fold LightGBM model on the tree-data test split.
lgb_pred_test = lgb_model.predict(X_testt)
# FIX: label — mean_squared_error returns MSE, not RMSE.
print('The mse of the test is:', metrics.mean_squared_error(y_testt, lgb_pred_test))
x = list(range(0, len(y_testt)))
plt.figure(figsize=(40, 10))
# Only the first 100 test samples are plotted to keep the chart readable.
plt.plot(x[:100], y_testt[:100], 'g', label='ground-truth')  # FIX: 'turth' typo
plt.plot(x[:100], lgb_pred_test[:100], 'r', label='lgb predict')
plt.legend(loc='best')
plt.title('lgb-yield')
plt.xlabel('sample')
plt.ylabel('Yield_lnRR')
plt.show()
joblib.dump(lgb_model, f'{model_save_file}/lgb_model.pkl')
result_model['Lightgbm(k)'] = metrics.mean_squared_error(y_testt, lgb_pred_test)
The rmse of the test is: 90.01997376916715
from catboost import CatBoostRegressor

# CatBoost on the tree-encoded dataset, 5-fold out-of-fold evaluation.
params = {
    'learning_rate': 0.05,
    'loss_function': "RMSE",
    'eval_metric': "RMSE",  # CrossEntropy
    'depth': 8,
    'min_data_in_leaf': 20,
    'random_seed': 42,
    'logging_level': 'Silent',
    'use_best_model': True,      # requires an eval_set — supplied in fit() below
    'one_hot_max_size': 5,       # categories above this count use ordered target statistics (default 2)
    'boosting_type': "Ordered",  # 'Ordered' or 'Plain'; Ordered is slower but reduces gradient-estimation bias on small data
    'max_ctr_complexity': 2,     # max features per combination; 1 disables combinations (default 4)
    'nan_mode': 'Min'
}
iterations = 400
early_stopping_rounds = 200
ctb_model = CatBoostRegressor(iterations=iterations,
                              early_stopping_rounds=early_stopping_rounds,
                              **params)
for i, (train_index, valid_index) in enumerate(kf.split(X_traint)):
    print('************************************ {} ************************************'.format(str(i+1)))
    x_tr, x_va = X_traint.iloc[train_index], X_traint.iloc[valid_index]
    y_tr, y_va = y_traint.iloc[train_index], y_traint.iloc[valid_index]
    print(x_tr.shape, y_tr.shape)
    # NOTE(review): early_stopping_rounds=1000 here overrides the 200 set in the
    # constructor; with iterations=400, early stopping effectively never fires.
    ctb_model.fit(x_tr, y_tr, eval_set=(x_va, y_va), verbose=0, early_stopping_rounds=1000)
    ctb_pre = ctb_model.predict(x_va)
    # FIX: mean_squared_error returns MSE, not RMSE — label it correctly.
    print('The mse of the CatRegression is:', metrics.mean_squared_error(y_va, ctb_pre))
    x = list(range(0, len(y_va)))
    plt.figure(figsize=(40, 10))
    plt.plot(x, y_va, 'g', label='ground-truth')  # FIX: 'turth' typo
    plt.plot(x, ctb_pre, 'r', label='cat predict')
    plt.legend(loc='best')
    plt.title('cat-yield')
    plt.xlabel('sample')
    plt.ylabel('Yield_lnRR')
    plt.show()
************************************ 1 ************************************ (535, 9) (535,) The rmse of the CatRegression is: 65.41703351619354
************************************ 2 ************************************ (535, 9) (535,) The rmse of the CatRegression is: 79.09693900072871
************************************ 3 ************************************ (535, 9) (535,) The rmse of the CatRegression is: 60.165751627339866
************************************ 4 ************************************ (535, 9) (535,) The rmse of the CatRegression is: 73.7228345960462
************************************ 5 ************************************ (536, 9) (536,) The rmse of the CatRegression is: 72.28120593473213
# Evaluate the last-fold CatBoost model on the tree-data test split.
cat_pred_test = ctb_model.predict(X_testt)
# FIX: label — this is MSE, not RMSE.
print('The mse of the test is:', metrics.mean_squared_error(y_testt, cat_pred_test))
x = list(range(0, len(y_testt)))
plt.figure(figsize=(40, 10))
plt.plot(x, y_testt, 'g', label='ground-truth')  # FIX: 'turth' typo
plt.plot(x, cat_pred_test, 'r', label='cat predict')
plt.legend(loc='best')
plt.title('cat-yield')
plt.xlabel('sample')
plt.ylabel('Yield_lnRR')
plt.show()
joblib.dump(ctb_model, f'{model_save_file}/ctb_model.pkl')
# BUG FIX: the score was computed against y_test (non-tree split) while the
# predictions come from X_testt — compare against the matching y_testt labels.
result_model['Catboost(k)'] = metrics.mean_squared_error(y_testt, cat_pred_test)
The rmse of the test is: 99.7637237685675
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline

# Ridge regression baseline, 5-fold out-of-fold evaluation.
ri_model = Ridge(alpha=0.5, fit_intercept=True)
lr_preds = np.zeros(X_train.shape[0])
for train_index, valid_index in kf.split(X_train, y_train):
    x_tr, x_va = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_tr, y_va = y_train.iloc[train_index], y_train.iloc[valid_index]
    print(x_tr.shape, y_tr.shape)
    ri_model.fit(x_tr, y_tr)
    lr_pred_valid = ri_model.predict(x_va)
    lr_preds[valid_index] = lr_pred_valid
    # FIX: this is Ridge (not plain LR) and mean_squared_error returns MSE, not RMSE.
    print('The mse of the Ridge is:', metrics.mean_squared_error(y_va, lr_pred_valid))
    x = list(range(0, len(y_va)))
    plt.figure(figsize=(40, 10))
    plt.plot(x, y_va, 'g', label='ground-truth')  # FIX: 'turth' typo
    plt.plot(x, lr_pred_valid, 'r', label='ridge predict')
    plt.legend(loc='best')
    plt.title('ridge-yield')
    plt.xlabel('sample')
    plt.ylabel('Yield_lnRR')
    plt.show()
(535, 9) (535,) The rmse of the LR is: 118.5247762940648
(535, 9) (535,) The rmse of the LR is: 91.32071794094006
(535, 9) (535,) The rmse of the LR is: 120.4805198773532
(535, 9) (535,) The rmse of the LR is: 102.9025768099061
(536, 9) (536,) The rmse of the LR is: 145.73062685873657
# Evaluate the last-fold Ridge model on the held-out test split.
ri_pred_test = ri_model.predict(X_test)
# FIX: label — this is MSE, not RMSE.
print('The mse of the test is:', metrics.mean_squared_error(y_test, ri_pred_test))
x = list(range(0, len(y_test)))
plt.figure(figsize=(40, 10))
plt.plot(x, y_test, 'g', label='ground-truth')  # FIX: 'turth' typo
plt.plot(x, ri_pred_test, 'r', label='ridge predict')
plt.legend(loc='best')
plt.title('ridge-yield')
plt.xlabel('sample')
plt.ylabel('Yield_lnRR')
plt.show()
joblib.dump(ri_model, f'{model_save_file}/ri_model.pkl')
result_model['RidgeRegression(k)'] = metrics.mean_squared_error(y_test, ri_pred_test)
The rmse of the test is: 153.13336047251556
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline

# Ordinary least-squares baseline, 5-fold out-of-fold evaluation.
# FIX: the `normalize=` parameter was removed in scikit-learn 1.2; its former
# default was False, so dropping it leaves the behavior unchanged.
lr_model = LinearRegression(fit_intercept=True, copy_X=True, n_jobs=1)
lr_preds = np.zeros(X_train.shape[0])
for train_index, valid_index in kf.split(X_train, y_train):
    x_tr, x_va = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_tr, y_va = y_train.iloc[train_index], y_train.iloc[valid_index]
    print(x_tr.shape, y_tr.shape)
    lr_model.fit(x_tr, y_tr)
    lr_pred_valid = lr_model.predict(x_va)
    lr_preds[valid_index] = lr_pred_valid
    # FIX: mean_squared_error returns MSE, not RMSE — label it correctly.
    print('The mse of the LR is:', metrics.mean_squared_error(y_va, lr_pred_valid))
    x = list(range(0, len(y_va)))
    plt.figure(figsize=(40, 10))
    plt.plot(x, y_va, 'g', label='ground-truth')  # FIX: 'turth' typo
    plt.plot(x, lr_pred_valid, 'r', label='lr predict')
    plt.legend(loc='best')
    plt.title('lr-yield')
    plt.xlabel('sample')
    plt.ylabel('Yield_lnRR')
    plt.show()
(535, 9) (535,) The rmse of the LR is: 113.31311205133704
(535, 9) (535,) The rmse of the LR is: 93.11116165851249
(535, 9) (535,) The rmse of the LR is: 105.81811908648206
(535, 9) (535,) The rmse of the LR is: 167.33796204169673
(536, 9) (536,) The rmse of the LR is: 103.7560669453761
# Evaluate the last-fold linear model on the held-out test split.
lr_pred_test = lr_model.predict(X_test)
# FIX: label — this is MSE, not RMSE.
print('The mse of the test is:', metrics.mean_squared_error(y_test, lr_pred_test))
x = list(range(0, len(y_test)))
plt.figure(figsize=(40, 10))
plt.plot(x, y_test, 'g', label='ground-truth')  # FIX: 'turth' typo
plt.plot(x, lr_pred_test, 'r', label='lr predict')
plt.legend(loc='best')
plt.title('lr-yield')
plt.xlabel('sample')
plt.ylabel('Yield_lnRR')
plt.show()
joblib.dump(lr_model, f'{model_save_file}/lr_model.pkl')
result_model['LinearRegression(k)'] = metrics.mean_squared_error(y_test, lr_pred_test)
The rmse of the test is: 152.84409567861422
from sklearn.neural_network import MLPRegressor

# Two 40-unit hidden layers; L-BFGS suits small datasets.
mlp_model = MLPRegressor(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(40, 40), max_iter=500, random_state=2023)
mlp_preds = np.zeros(X_train.shape[0])
for train_index, valid_index in kf.split(X_train, y_train):
    x_tr, x_va = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_tr, y_va = y_train.iloc[train_index], y_train.iloc[valid_index]
    print(x_tr.shape, y_tr.shape)
    mlp_model.fit(x_tr, y_tr)
    mlp_pred_valid = mlp_model.predict(x_va)
    mlp_preds[valid_index] = mlp_pred_valid
    # FIX: mean_squared_error returns MSE, not RMSE — label it correctly.
    print('The mse of the MLP is:', metrics.mean_squared_error(y_va, mlp_pred_valid))
    x = list(range(0, len(y_va)))
    plt.figure(figsize=(40, 10))
    plt.plot(x, y_va, 'g', label='ground-truth')  # FIX: 'turth' typo
    plt.plot(x, mlp_pred_valid, 'r', label='mlp predict')
    plt.legend(loc='best')
    plt.title('mlp-yield')
    plt.xlabel('sample')
    plt.ylabel('Yield_lnRR')
    plt.show()
(535, 9) (535,) The rmse of the MLP is: 64.54947984165855
(535, 9) (535,) The rmse of the MLP is: 100.48187981995927
(535, 9) (535,) The rmse of the MLP is: 91.98040411391537
(535, 9) (535,) The rmse of the MLP is: 146.4392203169691
(536, 9) (536,) The rmse of the MLP is: 100.49902790636249
# Evaluate the last-fold MLP on the held-out test split.
mlp_pred_test = mlp_model.predict(X_test)
# FIX: label — this is MSE, not RMSE.
print('The mse of the test is:', metrics.mean_squared_error(y_test, mlp_pred_test))
x = list(range(0, len(y_test)))
plt.figure(figsize=(40, 10))
plt.plot(x, y_test, 'g', label='ground-truth')  # FIX: 'turth' typo
plt.plot(x, mlp_pred_test, 'r', label='mlp predict')
plt.legend(loc='best')
plt.title('mlp-yield')
plt.xlabel('sample')
plt.ylabel('Yield_lnRR')
plt.show()
joblib.dump(mlp_model, f'{model_save_file}/mlp_model.pkl')
result_model['MLP(k)'] = metrics.mean_squared_error(y_test, mlp_pred_test)
The rmse of the test is: 142.08429708042357
from sklearn.svm import SVR

# RBF-kernel support-vector regression, 5-fold out-of-fold evaluation.
svr_model = SVR(kernel='rbf',
                degree=3,
                coef0=0.0,
                tol=0.001,
                C=1.0,
                epsilon=0.1,
                shrinking=True,
                cache_size=200,
                verbose=False,
                max_iter=-1)
svr_preds = np.zeros(X_train.shape[0])
for train_index, valid_index in kf.split(X_train, y_train):
    x_tr, x_va = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_tr, y_va = y_train.iloc[train_index], y_train.iloc[valid_index]
    print(x_tr.shape, y_tr.shape)
    svr_model.fit(x_tr, y_tr)
    svr_pred_valid = svr_model.predict(x_va)
    svr_preds[valid_index] = svr_pred_valid
    # FIX: this is the SVR (message said 'LR') and the metric is MSE, not RMSE.
    print('The mse of the SVR is:', metrics.mean_squared_error(y_va, svr_pred_valid))
    x = list(range(0, len(y_va)))
    plt.figure(figsize=(40, 10))
    plt.plot(x, y_va, 'g', label='ground-truth')  # FIX: 'turth' typo
    plt.plot(x, svr_pred_valid, 'r', label='svr predict')
    plt.legend(loc='best')
    plt.title('svr-yield')
    plt.xlabel('sample')
    plt.ylabel('Yield_lnRR')
    plt.show()
(535, 9) (535,) The rmse of the LR is: 138.4509977311952
(535, 9) (535,) The rmse of the LR is: 112.88574994132262
(535, 9) (535,) The rmse of the LR is: 97.6472119945676
(535, 9) (535,) The rmse of the LR is: 100.71330341737936
(536, 9) (536,) The rmse of the LR is: 148.6798739815513
# Evaluate the last-fold SVR on the held-out test split.
svr_pred_test = svr_model.predict(X_test)
# FIX: label — this is MSE, not RMSE.
print('The mse of the test is:', metrics.mean_squared_error(y_test, svr_pred_test))
x = list(range(0, len(y_test)))
plt.figure(figsize=(40, 10))
plt.plot(x, y_test, 'g', label='ground-truth')       # FIX: 'turth' typo
plt.plot(x, svr_pred_test, 'r', label='svr predict')  # FIX: labels said 'mlp'
plt.legend(loc='best')
plt.title('svr-yield')                                # FIX: title said 'mlp-yield'
plt.xlabel('sample')
plt.ylabel('Yield_lnRR')
plt.show()
joblib.dump(svr_model, f'{model_save_file}/svr_model.pkl')
result_model['SVR(k)'] = metrics.mean_squared_error(y_test, svr_pred_test)
The rmse of the test is: 154.50148948624263
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF

# Gaussian-process regression with an RBF kernel, 5-fold evaluation.
gp_model = GaussianProcessRegressor(kernel=RBF())
gp_preds = np.zeros(X_train.shape[0])
for train_index, valid_index in kf.split(X_train, y_train):
    x_tr, x_va = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_tr, y_va = y_train.iloc[train_index], y_train.iloc[valid_index]
    print(x_tr.shape, y_tr.shape)
    gp_model.fit(x_tr, y_tr)
    gp_pred_valid = gp_model.predict(x_va)
    gp_preds[valid_index] = gp_pred_valid
    # FIX: this is the GP (message said 'LR') and the metric is MSE, not RMSE.
    print('The mse of the GP is:', metrics.mean_squared_error(y_va, gp_pred_valid))
    x = list(range(0, len(y_va)))
    plt.figure(figsize=(40, 10))
    plt.plot(x, y_va, 'g', label='ground-truth')  # FIX: 'turth' typo
    plt.plot(x, gp_pred_valid, 'r', label='gp predict')
    plt.legend(loc='best')
    plt.title('gp-yield')
    plt.xlabel('sample')
    plt.ylabel('Yield_lnRR')
    plt.show()
(535, 9) (535,) The rmse of the LR is: 178.81596682846134
(535, 9) (535,) The rmse of the LR is: 242.78209000036935
(535, 9) (535,) The rmse of the LR is: 231.17116404877655
(535, 9) (535,) The rmse of the LR is: 240.36255291792932
(536, 9) (536,) The rmse of the LR is: 168.86409456256115
# Evaluate the last-fold GP on the held-out test split.
gp_pred_test = gp_model.predict(X_test)
# FIX: label — this is MSE, not RMSE.
print('The mse of the test is:', metrics.mean_squared_error(y_test, gp_pred_test))
x = list(range(0, len(y_test)))
plt.figure(figsize=(40, 10))
plt.plot(x, y_test, 'g', label='ground-truth')      # FIX: 'turth' typo
plt.plot(x, gp_pred_test, 'r', label='gp predict')  # FIX: labels said 'mlp'
plt.legend(loc='best')
plt.title('gp-yield')                               # FIX: title said 'mlp-yield'
plt.xlabel('sample')
plt.ylabel('Yield_lnRR')
plt.show()
joblib.dump(gp_model, f'{model_save_file}/gp_model.pkl')
result_model['GR(k)'] = metrics.mean_squared_error(y_test, gp_pred_test)
The rmse of the test is: 259.07925065377
import json
# Persist the per-model test-set scores as JSON for the plotting cell below.
with open(model_performance_json, 'w') as json_file:
    json.dump(result_model, json_file)
import plotly.express as px
import plotly.io as pio

# Bar chart ranking the models by test-set score (lowest = best, first).
with open(model_performance_json, 'r') as json_file:
    result_model = json.load(json_file)
result_model = dict(sorted(result_model.items(), key=lambda item: item[1]))
categories = list(result_model.keys())
values = list(result_model.values())
color_mapping = {}
for k, v in result_model.items():
    # NOTE(review): the 0.4 cutoff looks tuned for an ln(EF)-scale target;
    # with MSE values in the ~90-260 range every model lands in "Other" — confirm.
    if v < 0.4:
        color_mapping[k] = "Tree-Base"
    else:
        color_mapping[k] = "Other"
# FIX: the title was hard-coded to 'ln(EF)' regardless of the configured target.
fig = px.bar(x=categories, y=values, title=f'Models Performance in {target}', color=color_mapping)
fig.update_layout(template="seaborn")
fig.show()
# Save the chart as a standalone HTML file.
pio.write_html(fig, file=model_performance_path)
import shap
shap.initjs()  # enable shap's JS visualizations in the notebook
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import seaborn as sns
# Shared plotly layout template and palette for the importance charts below.
temp = dict(layout=go.Layout(font=dict(family="Franklin Gothic", size=12), width=800))
colors=px.colors.qualitative.Plotly
# Lollipop chart of random-forest feature importances (ascending).
feat_importance = pd.DataFrame()
feat_importance["Importance"] = rf_model.feature_importances_
feat_importance.set_index(X_train.columns, inplace=True)
feat_importance = feat_importance.sort_values(by='Importance', ascending=True)
pal = sns.color_palette("plasma_r", 70).as_hex()[2:]
fig = go.Figure()
for i in range(len(feat_importance.index)):
    # FIX: the index holds feature names, so integer [] lookup relied on
    # pandas' deprecated positional fallback — use .iloc explicitly.
    fig.add_shape(dict(type="line", y0=i, y1=i, x0=0,
                       x1=feat_importance['Importance'].iloc[i],
                       line_color=pal[::-1][i], opacity=0.7, line_width=4))
fig.add_trace(go.Scatter(x=feat_importance['Importance'], y=feat_importance.index, mode='markers',
                         marker_color=pal[::-1], marker_size=8,
                         hovertemplate='%{y} Importance = %{x:.5f}<extra></extra>'))
fig.update_layout(template=temp, title='Overall Feature Importance',
                  xaxis=dict(title='Average Importance', zeroline=False),
                  yaxis_showgrid=False, margin=dict(l=120, t=80),
                  height=700, width=800)
fig.show()
# Lollipop chart of XGBoost feature importances (ascending).
feat_importance = pd.DataFrame()
feat_importance["Importance"] = xgb_model.feature_importances_
feat_importance.set_index(X_train.columns, inplace=True)
feat_importance = feat_importance.sort_values(by='Importance', ascending=True)
pal = sns.color_palette("plasma_r", 70).as_hex()[2:]
fig = go.Figure()
for i in range(len(feat_importance.index)):
    # FIX: use .iloc — the index holds feature names, so integer [] lookup
    # relied on pandas' deprecated positional fallback.
    fig.add_shape(dict(type="line", y0=i, y1=i, x0=0,
                       x1=feat_importance['Importance'].iloc[i],
                       line_color=pal[::-1][i], opacity=0.7, line_width=4))
fig.add_trace(go.Scatter(x=feat_importance['Importance'], y=feat_importance.index, mode='markers',
                         marker_color=pal[::-1], marker_size=8,
                         hovertemplate='%{y} Importance = %{x:.5f}<extra></extra>'))
fig.update_layout(template=temp, title='Overall Feature Importance',
                  xaxis=dict(title='Average Importance', zeroline=False),
                  yaxis_showgrid=False, margin=dict(l=120, t=80),
                  height=700, width=800)
fig.show()
# Lollipop chart of CatBoost feature importances (ascending, tree-data columns).
feat_importance = pd.DataFrame()
feat_importance["Importance"] = ctb_model.feature_importances_
feat_importance.set_index(X_traint.columns, inplace=True)
feat_importance = feat_importance.sort_values(by='Importance', ascending=True)
pal = sns.color_palette("plasma_r", 70).as_hex()[2:]
fig = go.Figure()
for i in range(len(feat_importance.index)):
    # FIX: use .iloc — the index holds feature names, so integer [] lookup
    # relied on pandas' deprecated positional fallback.
    fig.add_shape(dict(type="line", y0=i, y1=i, x0=0,
                       x1=feat_importance['Importance'].iloc[i],
                       line_color=pal[::-1][i], opacity=0.7, line_width=4))
fig.add_trace(go.Scatter(x=feat_importance['Importance'], y=feat_importance.index, mode='markers',
                         marker_color=pal[::-1], marker_size=8,
                         hovertemplate='%{y} Importance = %{x:.5f}<extra></extra>'))
fig.update_layout(template=temp, title='Overall Feature Importance',
                  xaxis=dict(title='Average Importance', zeroline=False),
                  yaxis_showgrid=False, margin=dict(l=120, t=80),
                  height=700, width=800)
fig.show()
# Lollipop chart of LightGBM feature importances (ascending, tree-data columns).
feat_importance = pd.DataFrame()
feat_importance["Importance"] = lgb_model.feature_importances_
feat_importance.set_index(X_traint.columns, inplace=True)
feat_importance = feat_importance.sort_values(by='Importance', ascending=True)
pal = sns.color_palette("plasma_r", 70).as_hex()[2:]
fig = go.Figure()
for i in range(len(feat_importance.index)):
    # FIX: use .iloc — the index holds feature names, so integer [] lookup
    # relied on pandas' deprecated positional fallback.
    fig.add_shape(dict(type="line", y0=i, y1=i, x0=0,
                       x1=feat_importance['Importance'].iloc[i],
                       line_color=pal[::-1][i], opacity=0.7, line_width=4))
fig.add_trace(go.Scatter(x=feat_importance['Importance'], y=feat_importance.index, mode='markers',
                         marker_color=pal[::-1], marker_size=8,
                         hovertemplate='%{y} Importance = %{x:.5f}<extra></extra>'))
fig.update_layout(template=temp, title='Overall Feature Importance',
                  xaxis=dict(title='Average Importance', zeroline=False),
                  yaxis_showgrid=False, margin=dict(l=120, t=80),
                  height=700, width=800)
fig.show()
SHAP是由Shapley value启发的可加性解释模型。对于每个预测样本,模型都产生一个预测值,SHAP value就是该样本中每个特征所分配到的数值。 很明显可以看出,与上一节中feature importance相比,SHAP value最大的优势是SHAP能对于反映出每一个样本中的特征的影响力,而且还表现出影响的正负性。
# Bar chart of LightGBM feature importances, highest first.
plt.figure(figsize=(15, 5))
feat_importance = pd.DataFrame({"Importance": lgb_model.feature_importances_})
feat_importance.set_index(X_traint.columns, inplace=True)
feat_importance = feat_importance.sort_values(by='Importance', ascending=False)
positions = range(len(X_traint.columns))
plt.bar(positions, feat_importance['Importance'])
plt.xticks(positions, feat_importance.index, rotation=90, fontsize=14)
plt.title('Feature importance', fontsize=14)
plt.show()
# Build a TreeExplainer for the LightGBM model and compute SHAP values
# for every training sample (one row per sample, one column per feature).
lgb_explainer = shap.TreeExplainer(lgb_model)
lgb_shap_values = lgb_explainer.shap_values(X_traint)
print(lgb_shap_values.shape)
(669, 9)
shap.force_plot(lgb_explainer.expected_value, lgb_shap_values[0,:], X_traint.iloc[0,:])  # force plot for the first training sample
对第一个实例的特征贡献图也可用 waterfall 方式展示
# Waterfall view of the first sample's feature contributions
# (red pushes the prediction up from the base value, blue pushes it down).
shap.plots._waterfall.waterfall_legacy(
    lgb_explainer.expected_value,
    lgb_shap_values[0],
    feature_names=X_traint.columns)
上图的解释显示了每个有助于将模型输出从基值(我们传递的训练数据集上的平均模型输出)贡献到模型输出值的特征。将预测推高的特征以红色显示,将预测推低的特征以蓝色显示。
如果我们采取许多实例来聚合显示解释,如下图所示,将它们旋转 90 度,然后将它们水平堆叠,我们可以看到整个数据集的解释(在 Notebook 中,此图是交互式的)
# Visualize the training-set predictions: per-sample force plots rotated 90°
# and stacked horizontally (interactive in a notebook).
shap.force_plot(lgb_explainer.expected_value, lgb_shap_values, X_traint)
下图中每一行代表一个特征,横坐标为SHAP值。一个点代表一个样本,颜色越红说明特征本身数值越大,颜色越蓝说明特征本身数值越小。
shap.summary_plot(lgb_shap_values, X_traint)  # beeswarm: one row per feature, one dot per sample
shap value值排序的特征重要性
shap.summary_plot(lgb_shap_values, X_traint, plot_type="bar")  # feature ranking by mean |SHAP|
SHAP也提供了部分依赖图的功能,与传统的部分依赖图不同的是,这里纵坐标不是目标变量y的数值而是SHAP值。可以观察各个特征的分布与目标shap值的关系。
# One SHAP dependence plot per input feature, arranged on a 3-column grid.
# FIX: len//3+1 allocated a spare blank row whenever len is a multiple of 3;
# ceil-divide instead. squeeze=False keeps `axes` 2-D even for a single row.
n_rows = (len(input_cols) + 2) // 3
fig, axes = plt.subplots(n_rows, 3, figsize=(20, 30), squeeze=False)
for i, col in enumerate(input_cols):
    shap.dependence_plot(col, lgb_shap_values, X_traint, interaction_index=None,
                         show=False, ax=axes[i // 3, i % 3])
我们也可以多个变量的交互作用进行分析。一种方式是采用summary_plot描绘出散点图,如下:
# Pairwise SHAP interaction values for the LightGBM model.
shap_interaction_values = shap.TreeExplainer(lgb_model).shap_interaction_values(X_traint)
# FIX: summary_plot draws its own figure; the previous plt.figure(figsize=(12,12))
# only produced an empty throwaway figure ('<Figure ... with 0 Axes>' output).
shap.summary_plot(shap_interaction_values, X_traint, max_display=6)
plt.show()
<Figure size 1200x1200 with 0 Axes>
我们也可以用dependence_plot描绘两个变量交互下变量对目标值的影响。
shap.dependence_plot(input_cols[0], lgb_shap_values, X_traint, interaction_index=input_cols[1], show=False)  # dependence of feature 0, colored by feature 1
也能可视化每种特征对于整体预测值的影响。
shap.decision_plot(lgb_explainer.expected_value, lgb_shap_values, X_traint, ignore_warnings=True)  # cumulative per-feature contribution paths
# Lollipop chart of CatBoost feature importances.
# NOTE(review): this cell duplicates the earlier CatBoost importance chart.
feat_importance = pd.DataFrame()
feat_importance["Importance"] = ctb_model.feature_importances_
feat_importance.set_index(X_traint.columns, inplace=True)
feat_importance = feat_importance.sort_values(by='Importance', ascending=True)
pal = sns.color_palette("plasma_r", 70).as_hex()[2:]
fig = go.Figure()
for i in range(len(feat_importance.index)):
    # FIX: use .iloc — the index holds feature names, so integer [] lookup
    # relied on pandas' deprecated positional fallback.
    fig.add_shape(dict(type="line", y0=i, y1=i, x0=0,
                       x1=feat_importance['Importance'].iloc[i],
                       line_color=pal[::-1][i], opacity=0.7, line_width=4))
fig.add_trace(go.Scatter(x=feat_importance['Importance'], y=feat_importance.index, mode='markers',
                         marker_color=pal[::-1], marker_size=8,
                         hovertemplate='%{y} Importance = %{x:.5f}<extra></extra>'))
fig.update_layout(template=temp, title='Overall Feature Importance',
                  xaxis=dict(title='Average Importance', zeroline=False),
                  yaxis_showgrid=False, margin=dict(l=120, t=80),
                  height=700, width=800)
fig.show()